Loan Approval Analysis
Context
This dataset was sourced from Kaggle to serve as a basis for machine learning models that predict whether applicants will be approved or rejected based on their loan application. Predictors such as education level, income, and employment length can be used to determine whether an applicant will be able to repay their loan.
Import Libraries
require(tidyverse)
require(ggplot2)
require(scales)
require(readr)
require(data.table)
require(reshape2)
require(e1071)
require(class)
require(naivebayes)
require(randomForest)
require(kableExtra)
Importing Dataset
credit <- read_csv("credit_card/credit_record.csv")
app <- read_csv("credit_card/application_record.csv")
Summary of Data
head(credit)
## # A tibble: 6 x 3
## ID MONTHS_BALANCE STATUS
## <dbl> <dbl> <chr>
## 1 5001711 0 X
## 2 5001711 -1 0
## 3 5001711 -2 0
## 4 5001711 -3 0
## 5 5001712 0 C
## 6 5001712 -1 C
head(app)
## # A tibble: 6 x 18
## ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 5008804 M Y Y 0 427500
## 2 5008805 M Y Y 0 427500
## 3 5008806 M Y Y 0 112500
## 4 5008808 F N Y 0 270000
## 5 5008809 F N Y 0 270000
## 6 5008810 F N Y 0 270000
## # ... with 12 more variables: NAME_INCOME_TYPE <chr>,
## # NAME_EDUCATION_TYPE <chr>, NAME_FAMILY_STATUS <chr>,
## # NAME_HOUSING_TYPE <chr>, DAYS_BIRTH <dbl>, DAYS_EMPLOYED <dbl>,
## # FLAG_MOBIL <dbl>, FLAG_WORK_PHONE <dbl>, FLAG_PHONE <dbl>,
## # FLAG_EMAIL <dbl>, OCCUPATION_TYPE <chr>, CNT_FAM_MEMBERS <dbl>
Renaming Columns
colnames(app) <- c("ID", "Gender", "Car", "Prop", "Num_Child",
                   "Income", "Inc_Cat", "Education",
                   "Marital_Stat", "Housing_Type", "Birthday", "Emp_Start",
                   "Cell", "Work", "Home", "Email",
                   "Occupation", "Family")
colnames(credit) <- c("ID", "Month_Start", "Status")
Visualizing the Data
By visualizing the variables of the dataset we can gain insight into what the entire dataset looks like, rather than looking at only the first few rows with the head() function.
Gender Distribution
gender_plot <- ggplot(app, aes(Gender)) +
geom_bar(color = "black", fill = "grey") +
theme_classic() +
labs(x = "Gender",
y = "Count",
title = "Gender Distribution") +
scale_y_continuous(labels = comma)
gender_plot
Age Distribution
age_plot <- app %>%
mutate(age = round(abs(Birthday)/365)) %>%
ggplot(aes(age)) +
geom_bar(color = "black", fill = "grey") +
theme_classic() +
labs(x = "Age",
y = "Count",
title = "Age Distribution") +
scale_y_continuous(labels = comma)
age_plot
Employment Length Distribution
employment_plot <- app %>%
mutate(years = ifelse(Emp_Start >=0, 0, round(abs(Emp_Start)/365))) %>%
filter(years > 0) %>%
ggplot(aes(years)) +
geom_bar(color = "black", fill = "grey") +
theme_classic() +
labs(x = "Years",
y = "Count",
title = "Employment Length Distribution") +
scale_y_continuous(labels = comma)
employment_plot
IDs with 0 years of experience are removed from this graph for visual clarity; according to the dataset description, those applicants are unemployed.
Education Type Distribution
education_plot <- app %>% ggplot(aes(Education)) +
geom_bar(color = "black", fill = "grey") +
labs(x = "Education Type",
y = "Count",
title ="Education Type Distribution") +
theme_classic() +
scale_y_continuous(labels = comma) + coord_flip()
education_plot
A majority of the applicants hold a high school diploma, a college degree, or higher.
Credit Length
credit_plot <- credit %>% ggplot(aes(abs(Month_Start)/12)) +
geom_bar(color = "black", fill = "grey") +
labs(x = "Credit Length in Years",
y = "Count",
title = "Credit Length Distribution") +
theme_classic() +
scale_y_continuous(labels = comma)
credit_plot
This does not accurately represent each applicant's credit history, since the data contains multiple monthly credit records per ID; when joining, we take the longest length to best represent each applicant.
Join Loan Application to Applicant’s Info
begin_month <- credit %>%
group_by(ID) %>%
summarise(min_month = min(Month_Start))
new_data <- left_join(app, begin_month, by = c("ID"))
By joining the datasets we consolidate each applicant's credit history into a single loan application record.
Determine Application Approval
credit$at_risk <- NA
# Status codes 2-5 indicate payments at least 60 days overdue (per the dataset description)
credit$at_risk[credit$Status %in% c('2', '3', '4', '5')] <- 'Yes'
counter <- credit %>%
  group_by(ID) %>%
  summarise_all(~ sum(!is.na(.x)))  # per ID, count non-NA values in each column
counter$at_risk[counter$at_risk > 0] = 'Yes'
counter$at_risk[counter$at_risk == 0] = 'No'
counter$at_risk <- as.factor(counter$at_risk)
counter <- counter[c(1, 4)]  # keep only ID and the at_risk count
new_data <- inner_join(new_data, counter, by = 'ID')
new_data$target[new_data$at_risk == 'Yes'] = 1
new_data$target[new_data$at_risk == 'No'] = 0
counter %>% group_by(at_risk) %>% count()
## # A tibble: 2 x 2
## # Groups: at_risk [2]
## at_risk n
## <fct> <int>
## 1 No 45318
## 2 Yes 667
Only a small number of applicants are at risk of defaulting on a loan.
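The imbalance can be quantified directly; a one-line check of the class proportions:
prop.table(table(counter$at_risk))  # share of applicants flagged at risk vs. not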
Omit NA values
full_table <- na.omit(new_data)
Binary Features
Gender
full_table$Gender <- factor(full_table$Gender, labels = c('F','M'))
full_table %>% group_by(Gender) %>% count()
## # A tibble: 2 x 2
## # Groups: Gender [2]
## Gender n
## <fct> <int>
## 1 F 15630
## 2 M 9504
Car
full_table$Car <- factor(full_table$Car, labels = c('No', 'Yes'))
full_table %>% group_by(Car) %>% count()
## # A tibble: 2 x 2
## # Groups: Car [2]
## Car n
## <fct> <int>
## 1 No 14618
## 2 Yes 10516
Property
full_table$Prop <- factor(full_table$Prop, labels = c('No','Yes'))
full_table %>% group_by(Prop) %>% count()
## # A tibble: 2 x 2
## # Groups: Prop [2]
## Prop n
## <fct> <int>
## 1 No 8673
## 2 Yes 16461
Email
full_table$Email <- factor(full_table$Email, labels = c('No', 'Yes'))
full_table %>% group_by(Email) %>% count()
## # A tibble: 2 x 2
## # Groups: Email [2]
## Email n
## <fct> <int>
## 1 No 22604
## 2 Yes 2530
Work Phone
full_table$Work <- factor(full_table$Work, labels = c('No','Yes'))
full_table %>% group_by(Work) %>% count()
## # A tibble: 2 x 2
## # Groups: Work [2]
## Work n
## <fct> <int>
## 1 No 18252
## 2 Yes 6882
Home
full_table$Home <- factor(full_table$Home, labels = c('No', 'Yes'))
full_table %>% group_by(Home) %>% count()
## # A tibble: 2 x 2
## # Groups: Home [2]
## Home n
## <fct> <int>
## 1 No 17775
## 2 Yes 7359
Cell
full_table$Cell <- factor(full_table$Cell, labels = c('Yes'))
full_table %>% group_by(Cell) %>% count()
## # A tibble: 1 x 2
## # Groups: Cell [1]
## Cell n
## <fct> <int>
## 1 Yes 25134
Since all of the applicants indicated that they have a cellphone, we can remove this feature from our models: a constant column cannot influence a model's decisions.
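More generally, any constant column can be dropped programmatically. A minimal dplyr sketch (assuming dplyr >= 1.0; no_constant is an illustrative name, and the analysis below keeps its manual column selection):
# keep only columns that take more than one distinct value
no_constant <- full_table %>% select(where(~ n_distinct(.x) > 1))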
Continuous Features
To help improve our models, we bin some features to reduce the influence of outliers. Features with enough rows per value are left unbinned.
Number of Children
unique(full_table$Num_Child)
## [1] 0 3 1 2 4 14 5 19 7
full_table$Num_Child <- cut(full_table$Num_Child,
breaks = c(0, 1, 2, 19),
include.lowest = TRUE,
labels = c('0','1','2+'))
full_table %>%
group_by(Num_Child) %>%
count()
## # A tibble: 3 x 2
## # Groups: Num_Child [3]
## Num_Child n
## <fct> <int>
## 1 0 22026
## 2 1 2715
## 3 2+ 393
Income
summary(full_table$Income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27000 135000 180000 194834 225000 1575000
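Income is left unbinned here. If its outliers proved problematic, quantile bins would be one option; a sketch with hypothetical labels, not applied to the data:
# cut income at its quartiles into four equal-frequency bins
income_bins <- cut(full_table$Income,
                   breaks = quantile(full_table$Income, probs = seq(0, 1, 0.25)),
                   include.lowest = TRUE,
                   labels = c('Low', 'Mid-Low', 'Mid-High', 'High'))
table(income_bins)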
Age
full_table$Age <- round(abs(full_table$Birthday)/365)
summary(full_table$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 33.00 40.00 40.54 48.00 67.00
Employment Length (Years)
full_table$Emp_Start <- round(full_table$Emp_Start / 365, digits = 1)
# positive values flag applicants without employment records; treat them as 0 years
full_table$Emp_Start[full_table$Emp_Start > 0] <- 0
full_table$Emp_Start <- abs(full_table$Emp_Start)
summary(full_table$Emp_Start)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.700 5.300 7.191 9.500 43.000
Family Size - Binned
unique(full_table$Family)
## [1] 2 1 5 3 4 6 15 7 20 9
full_table$Family <- cut(full_table$Family,
breaks = c(1, 2, 3, 20),
include.lowest = TRUE,
labels = c('1','2','3+'))
full_table %>%
group_by(Family) %>%
count()
## # A tibble: 3 x 2
## # Groups: Family [3]
## Family n
## <fct> <int>
## 1 1 16960
## 2 2 5216
## 3 3+ 2958
Categorical Features
Income Category - Binning
unique(full_table$Inc_Cat)
## [1] "Working" "Commercial associate" "State servant"
## [4] "Student" "Pensioner"
full_table %>%
group_by(Inc_Cat) %>%
count()
## # A tibble: 5 x 2
## # Groups: Inc_Cat [5]
## Inc_Cat n
## <chr> <int>
## 1 Commercial associate 7052
## 2 Pensioner 13
## 3 State servant 2437
## 4 Student 10
## 5 Working 15622
full_table$Inc_Cat[full_table$Inc_Cat %in% c('Student', 'Pensioner')] <- 'State servant'
Student and Pensioner appear only a handful of times, so they are folded into another category.
Binned Income Categories
full_table$Inc_Cat <- factor(full_table$Inc_Cat)
full_table %>%
group_by(Inc_Cat) %>%
count()
## # A tibble: 3 x 2
## # Groups: Inc_Cat [3]
## Inc_Cat n
## <fct> <int>
## 1 Commercial associate 7052
## 2 State servant 2460
## 3 Working 15622
Occupation Type - Binning
unique(full_table$Occupation)
## [1] "Security staff" "Sales staff" "Accountants"
## [4] "Laborers" "Managers" "Drivers"
## [7] "Core staff" "High skill tech staff" "Cleaning staff"
## [10] "Private service staff" "Cooking staff" "Low-skill Laborers"
## [13] "Medicine staff" "Secretaries" "Waiters/barmen staff"
## [16] "HR staff" "Realty agents" "IT staff"
full_table$Occupation[full_table$Occupation %in%
  c('Laborers', 'Low-skill Laborers', 'Cleaning staff', 'Cooking staff',
    'Drivers', 'Security staff', 'Waiters/barmen staff')] <- 'Laborer'
full_table$Occupation[full_table$Occupation %in%
  c('Accountants', 'Core staff', 'HR staff', 'Medicine staff',
    'Private service staff', 'Realty agents', 'Sales staff', 'Secretaries')] <- 'Office'
full_table$Occupation[full_table$Occupation %in%
  c('Managers', 'High skill tech staff', 'IT staff')] <- 'High Tech'
Binned Occupation Type
full_table$Occupation <- factor(full_table$Occupation)
full_table %>%
group_by(Occupation) %>%
count()
## # A tibble: 3 x 2
## # Groups: Occupation [3]
## Occupation n
## <fct> <int>
## 1 High Tech 4455
## 2 Laborer 10496
## 3 Office 10183
Education Type
full_table$Education[full_table$Education == 'Academic degree'] =
'Higher education'
full_table$Education <- factor(full_table$Education)
full_table %>%
group_by(Education) %>%
count()
## # A tibble: 4 x 2
## # Groups: Education [4]
## Education n
## <fct> <int>
## 1 Higher education 7146
## 2 Incomplete higher 993
## 3 Lower secondary 187
## 4 Secondary / secondary special 16808
Housing Type
full_table$Housing_Type <- factor(full_table$Housing_Type)
full_table %>%
group_by(Housing_Type) %>%
count()
## # A tibble: 6 x 2
## # Groups: Housing_Type [6]
## Housing_Type n
## <fct> <int>
## 1 Co-op apartment 152
## 2 House / apartment 22102
## 3 Municipal apartment 812
## 4 Office apartment 199
## 5 Rented apartment 439
## 6 With parents 1430
Marital Status
full_table$Marital_Stat <- factor(full_table$Marital_Stat)
full_table %>%
group_by(Marital_Stat) %>%
count()
## # A tibble: 5 x 2
## # Groups: Marital_Stat [5]
## Marital_Stat n
## <fct> <int>
## 1 Civil marriage 2133
## 2 Married 17509
## 3 Separated 1467
## 4 Single / not married 3445
## 5 Widow 580
Examine Cleaned Data
reduced_table <- full_table[-c(1, 11, 13, 19, 20)]  # drop ID, Birthday, Cell, min_month, and at_risk
summary(reduced_table)
## Gender Car Prop Num_Child Income
## F:15630 No :14618 No : 8673 0 :22026 Min. : 27000
## M: 9504 Yes:10516 Yes:16461 1 : 2715 1st Qu.: 135000
## 2+: 393 Median : 180000
## Mean : 194834
## 3rd Qu.: 225000
## Max. :1575000
## Inc_Cat Education
## Commercial associate: 7052 Higher education : 7146
## State servant : 2460 Incomplete higher : 993
## Working :15622 Lower secondary : 187
## Secondary / secondary special:16808
##
##
## Marital_Stat Housing_Type Emp_Start
## Civil marriage : 2133 Co-op apartment : 152 Min. : 0.000
## Married :17509 House / apartment :22102 1st Qu.: 2.700
## Separated : 1467 Municipal apartment: 812 Median : 5.300
## Single / not married: 3445 Office apartment : 199 Mean : 7.191
## Widow : 580 Rented apartment : 439 3rd Qu.: 9.500
## With parents : 1430 Max. :43.000
## Work Home Email Occupation Family
## No :18252 No :17775 No :22604 High Tech: 4455 1 :16960
## Yes: 6882 Yes: 7359 Yes: 2530 Laborer :10496 2 : 5216
## Office :10183 3+: 2958
##
##
##
## target Age
## Min. :0.00000 Min. :21.00
## 1st Qu.:0.00000 1st Qu.:33.00
## Median :0.00000 Median :40.00
## Mean :0.01679 Mean :40.54
## 3rd Qu.:0.00000 3rd Qu.:48.00
## Max. :1.00000 Max. :67.00
ggplot(full_table, aes(x = at_risk, y = Income)) +
geom_boxplot() +
facet_wrap(~Gender) +
labs(title = "Income Distribution by Credit Risk and Gender",
x = "Credit Risk",
y = "Income") +
theme_classic()
ggplot(full_table,
       aes(x = Age, y = Income, color = at_risk)) +
  geom_violin()
Sampling Data
The dataset is split 70/30 at random; note that the number of at-risk applicants is very low, so the classes are heavily imbalanced.
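With so few positive cases, a stratified split that preserves the class ratio in both sets would be a safer alternative; a base-R sketch (the strat_* names are illustrative, and the analysis below uses the simple random split):
set.seed(920)
# sample 70% within each class so both sets keep the original at-risk ratio
idx_by_class <- split(seq_len(nrow(reduced_table)), reduced_table$target)
train_idx <- unlist(lapply(idx_by_class, function(i) sample(i, floor(0.70 * length(i)))))
strat_train <- reduced_table[train_idx, ]
strat_test <- reduced_table[-train_idx, ]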
set.seed(920)
sample <- sample.int(n = nrow(full_table), size = floor(.70*nrow(full_table)), replace = F)
train.set <- reduced_table[sample, ]
test.set <- reduced_table[-sample, ]
Training Models for Classification
The methods implemented to classify and predict whether an applicant is eligible are Logistic Regression, Support Vector Machines, Naive Bayes, KNN, and Random Forest (ensembles of decision trees).
Logistic Regression
Logistic regression predicts the value of a categorical variable by modeling its relationship with the independent variables (predictors). It is mainly used in binary classification scenarios.
lm.model <- glm(formula = target ~ .,
family = binomial(link='logit'),
data = train.set)
summary(lm.model)
##
## Call:
## glm(formula = target ~ ., family = binomial(link = "logit"),
## data = train.set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3873 -0.2007 -0.1780 -0.1564 3.2992
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.645e+01 2.310e+02 -0.071 0.943218
## GenderM 2.497e-01 1.448e-01 1.724 0.084666
## CarYes -1.556e-01 1.288e-01 -1.208 0.227010
## PropYes -1.967e-01 1.260e-01 -1.561 0.118502
## Num_Child1 6.338e-01 6.272e-01 1.010 0.312278
## Num_Child2+ 1.679e+00 7.346e-01 2.286 0.022277
## Income 1.222e-07 5.910e-07 0.207 0.836119
## Inc_CatState servant 1.440e-01 2.127e-01 0.677 0.498360
## Inc_CatWorking -6.882e-02 1.348e-01 -0.510 0.609712
## EducationIncomplete higher 1.039e-01 2.853e-01 0.364 0.715826
## EducationLower secondary 2.661e-01 6.005e-01 0.443 0.657649
## EducationSecondary / secondary special -1.571e-01 1.401e-01 -1.121 0.262113
## Marital_StatMarried 9.471e-02 2.249e-01 0.421 0.673728
## Marital_StatSeparated -2.916e-01 3.678e-01 -0.793 0.427937
## Marital_StatSingle / not married 4.642e-01 2.571e-01 1.806 0.070949
## Marital_StatWidow 5.565e-01 4.074e-01 1.366 0.171959
## Housing_TypeHouse / apartment 1.275e+01 2.310e+02 0.055 0.955987
## Housing_TypeMunicipal apartment 1.301e+01 2.310e+02 0.056 0.955081
## Housing_TypeOffice apartment 1.293e+01 2.310e+02 0.056 0.955364
## Housing_TypeRented apartment 1.203e+01 2.310e+02 0.052 0.958459
## Housing_TypeWith parents 1.271e+01 2.310e+02 0.055 0.956103
## Emp_Start -4.078e-02 1.184e-02 -3.443 0.000576
## WorkYes 1.173e-01 1.419e-01 0.826 0.408662
## HomeYes -1.587e-02 1.380e-01 -0.115 0.908473
## EmailYes 6.328e-02 1.903e-01 0.332 0.739530
## OccupationLaborer -1.267e-01 1.776e-01 -0.713 0.475767
## OccupationOffice -1.348e-01 1.733e-01 -0.778 0.436720
## Family2 -1.365e-01 1.691e-01 -0.807 0.419426
## Family3+ -7.504e-01 6.752e-01 -1.111 0.266381
## Age 1.173e-03 7.031e-03 0.167 0.867442
##
## (Intercept)
## GenderM .
## CarYes
## PropYes
## Num_Child1
## Num_Child2+ *
## Income
## Inc_CatState servant
## Inc_CatWorking
## EducationIncomplete higher
## EducationLower secondary
## EducationSecondary / secondary special
## Marital_StatMarried
## Marital_StatSeparated
## Marital_StatSingle / not married .
## Marital_StatWidow
## Housing_TypeHouse / apartment
## Housing_TypeMunicipal apartment
## Housing_TypeOffice apartment
## Housing_TypeRented apartment
## Housing_TypeWith parents
## Emp_Start ***
## WorkYes
## HomeYes
## EmailYes
## OccupationLaborer
## OccupationOffice
## Family2
## Family3+
## Age
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3037.7 on 17592 degrees of freedom
## Residual deviance: 2984.2 on 17563 degrees of freedom
## AIC: 3044.2
##
## Number of Fisher Scoring iterations: 15
Logistic Regression Model Performance
log.predict <- predict(lm.model, test.set, type = "response")
log.prediction.rd <- ifelse(log.predict > 0.5, 1, 0)
print(paste('Accuracy:', 1 - mean(log.prediction.rd != test.set$target)))
## [1] "Accuracy: 0.983821774300491"
The logistic regression model classifies applicants with high accuracy, although with classes this skewed that figure is roughly what always predicting "not at risk" would achieve, so it should be read with caution.
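A confusion matrix makes this concrete by showing how many at-risk applicants the model actually flags; a quick check not in the original output:
# rows: predicted class, columns: true class
table(Predicted = log.prediction.rd, Actual = test.set$target)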
Support Vector Machines (SVM)
Support Vector Machines represent data points as objects in space and find a separating boundary that divides that space according to the target classes. SVMs remain efficient on data with high dimensionality.
svmfit <- svm(target ~ .,
data = train.set,
kernel = "linear",
type = "C-classification")
summary(svmfit)
##
## Call:
## svm(formula = target ~ ., data = train.set, kernel = "linear", type = "C-classification")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 959
##
## ( 659 300 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
SVM Performance
pred <- predict(svmfit, test.set)
svm.table0 <- table(test.set$target, pred)
paste("Accuracy:", sum(diag(svm.table0)) / sum(svm.table0))
## [1] "Accuracy: 0.983821774300491"
The SVM classification technique also returned a comparable accuracy; model tuning and parameter adjustment may yield a slightly higher accuracy.
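e1071 ships tune.svm for a simple grid search over cost; a hedged sketch (the grid values are arbitrary, and cross-validating an SVM on roughly 17k rows can be slow):
# 10-fold cross-validated grid search over the cost parameter
svm.tune <- tune.svm(as.factor(target) ~ ., data = train.set,
                     kernel = "linear", cost = c(0.1, 1, 10))
summary(svm.tune)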
Using an Optimal SVM
optimal.svm <- svm(as.factor(target) ~ .,
data = train.set,
type = "C-classification",
kernel = "linear",
gamma = 0.1,
cost = 1)
summary(optimal.svm)
##
## Call:
## svm(formula = as.factor(target) ~ ., data = train.set, type = "C-classification",
## kernel = "linear", gamma = 0.1, cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 959
##
## ( 659 300 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
Optimal SVM Performance
svm.predict <- predict(optimal.svm, test.set[, -16])  # drop column 16, the target
svm.table <- table(svm.predict, test.set$target)
paste("Accuracy:", sum(diag(svm.table)) / sum(svm.table))
## [1] "Accuracy: 0.983821774300491"
K-Nearest Neighbors (KNN)
K-nearest neighbors is a classification method that assigns a new data point the majority class among its k closest training points, as measured by distance.
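Because KNN works on raw Euclidean distances, a large-scale feature such as Income can dominate Age and Emp_Start; a minimal sketch of standardizing the columns first (the *_sc objects are illustrative and not used below):
# scale the training features, then apply the same centering/scaling to the test set
train_l_sc <- scale(train.set[c(5, 10, 17)])
test_l_sc <- scale(test.set[c(5, 10, 17)],
                   center = attr(train_l_sc, "scaled:center"),
                   scale = attr(train_l_sc, "scaled:scale"))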
train_l <- train.set[c(5, 10, 17)]   # Income, Emp_Start, Age
test_l <- test.set[c(5, 10, 17)]
train_label <- train.set$target
test_label <- test.set$target
# k of 132 and 133 sit near sqrt(nrow(train.set)), about 132.6, a common heuristic for choosing k
knn.model <- knn(train = train_l, test = test_l, cl = train_label, k = 132)
knn.model2 <- knn(train = train_l, test = test_l, cl = train_label, k = 133)
knn.table <- table(knn.model, test_label)
knn.table2 <- table(knn.model2, test_label)
paste("Accuracy for k = 132:", sum(diag(knn.table)) / sum(knn.table))
## [1] "Accuracy for k = 132: 0.983821774300491"
paste("Accuracy for k = 133:", sum(diag(knn.table2)) / sum(knn.table2))
## [1] "Accuracy for k = 133: 0.983821774300491"
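Rather than fixing k by hand, a quick scan over a range of k values can be compared; a sketch where the candidate grid is arbitrary:
# test accuracy for a coarse grid of k values
ks <- seq(1, 201, by = 20)
acc <- sapply(ks, function(k) {
  mean(knn(train = train_l, test = test_l, cl = train_label, k = k) == test_label)
})
data.frame(k = ks, accuracy = acc)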
Naive Bayes (NB)
Naive Bayes applies Bayes' theorem to classification problems by means of conditional probability, treating the predictor variables as independent of one another given the class: P(y | x1, ..., xp) is proportional to P(y) * P(x1 | y) * ... * P(xp | y).
nb.model <- naiveBayes(as.factor(target) ~ .,
data = train.set)
nb.model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.98294776 0.01705224
##
## Conditional probabilities:
## Gender
## Y F M
## 0 0.6229110 0.3770890
## 1 0.5666667 0.4333333
##
## Car
## Y No Yes
## 0 0.5783265 0.4216735
## 1 0.5933333 0.4066667
##
## Prop
## Y No Yes
## 0 0.3471347 0.6528653
## 1 0.4000000 0.6000000
##
## Num_Child
## Y 0 1 2+
## 0 0.87555658 0.10958191 0.01486150
## 1 0.86666667 0.10000000 0.03333333
##
## Income
## Y [,1] [,2]
## 0 194922.9 104576.29
## 1 199291.5 96269.56
##
## Inc_Cat
## Y Commercial associate State servant Working
## 0 0.28022899 0.09680217 0.62296883
## 1 0.30000000 0.10666667 0.59333333
##
## Education
## Y Higher education Incomplete higher Lower secondary
## 0 0.286879084 0.038281386 0.006997051
## 1 0.316666667 0.050000000 0.010000000
## Education
## Y Secondary / secondary special
## 0 0.667842480
## 1 0.623333333
##
## Marital_Stat
## Y Civil marriage Married Separated Single / not married Widow
## 0 0.08714509 0.69658243 0.05950385 0.13456312 0.02220552
## 1 0.07666667 0.65666667 0.04000000 0.19666667 0.03000000
##
## Housing_Type
## Y Co-op apartment House / apartment Municipal apartment Office apartment
## 0 0.006129648 0.880240560 0.032093911 0.007806627
## 1 0.000000000 0.873333333 0.043333333 0.010000000
## Housing_Type
## Y Rented apartment With parents
## 0 0.016943272 0.056785983
## 1 0.010000000 0.063333333
##
## Emp_Start
## Y [,1] [,2]
## 0 7.188203 6.418510
## 1 5.832667 5.470499
##
## Work
## Y No Yes
## 0 0.7243393 0.2756607
## 1 0.7000000 0.3000000
##
## Home
## Y No Yes
## 0 0.7058347 0.2941653
## 1 0.7033333 0.2966667
##
## Email
## Y No Yes
## 0 0.8983404 0.1016596
## 1 0.8900000 0.1100000
##
## Occupation
## Y High Tech Laborer Office
## 0 0.1764876 0.4173943 0.4061181
## 1 0.1966667 0.4200000 0.3833333
##
## Family
## Y 1 2 3+
## 0 0.6707916 0.2110102 0.1181981
## 1 0.6900000 0.1866667 0.1233333
##
## Age
## Y [,1] [,2]
## 0 40.51096 9.519697
## 1 39.64000 9.407732
NB Model Performance
nb.predict <- predict(nb.model, test.set)
nb.table <- table(test.set$target, nb.predict)
paste("Accuracy:", sum(diag(nb.table)) / sum(nb.table))
## [1] "Accuracy: 0.983821774300491"
Random Forest Model
Random forest is an ensemble classification method that grows a large number of decision trees and classifies each observation by the majority vote across those trees.
rf.model <- randomForest(as.factor(target) ~.,
data = train.set)
rf.model
##
## Call:
## randomForest(formula = as.factor(target) ~ ., data = train.set)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 1.85%
## Confusion matrix:
## 0 1 class.error
## 0 17233 60 0.003469612
## 1 266 34 0.886666667
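The fitted forest can also report which predictors matter most; a one-line check using randomForest's built-in importance plot (output not shown):
varImpPlot(rf.model)  # mean decrease in Gini impurity per predictor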
Random Forest Model Performance
rf.predict <- predict(rf.model, test.set)
rf.table <- table(test.set$target, rf.predict)
paste("Accuracy:", sum(diag(rf.table)) / sum(rf.table))
## [1] "Accuracy: 0.983821774300491"
Overall, the differences in accuracy between the classification methods are negligible. Some of the parameters could be tuned to improve their performance.
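As one concrete tuning step, randomForest::tuneRF searches over mtry, the number of variables tried at each split; a hedged sketch where stepFactor and improve are arbitrary choices:
# search for an mtry value that lowers the out-of-bag error
x <- train.set[, setdiff(names(train.set), "target")]
y <- as.factor(train.set$target)
tuneRF(x, y, ntreeTry = 200, stepFactor = 1.5, improve = 0.01)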